{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "social-parcel",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import hail as hl\n",
    "hl.init()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "portuguese-enclosure",
   "metadata": {},
   "source": [
    "NYGC 30x HighCov samples Hail Table:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hollywood-princess",
   "metadata": {},
   "outputs": [],
   "source": [
    "ht_samples = hl.import_table(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_samples_ped_population.txt.bgz\", \n",
    "    delimiter=\"\\s+\",\n",
    "    impute=True\n",
    ")\n",
    "\n",
    "ht_samples = ht_samples.annotate(\n",
    "    FatherID = hl.if_else(ht_samples.FatherID == \"0\", \n",
    "                          hl.missing(hl.tstr), \n",
    "                          ht_samples.FatherID), \n",
    "    MotherID = hl.if_else(ht_samples.MotherID == \"0\", \n",
    "                          hl.missing(hl.tstr), \n",
    "                    ht_samples.MotherID),\n",
    "                                 Sex = hl.if_else(ht_samples.Sex == 1, \"male\", \"female\")\n",
    ")\n",
    "ht_samples = ht_samples.key_by(\"SampleID\")\n",
    "\n",
    "n_rows = ht_samples.count()\n",
    "n_partitions = ht_samples.n_partitions()\n",
    "\n",
    "ht_samples = ht_samples.annotate_globals(\n",
    "    metadata=hl.struct(\n",
    "        name=\"1000_Genomes_HighCov_samples\",\n",
    "        n_rows=n_rows,\n",
    "        n_partitions=n_partitions)\n",
    ")\n",
    "\n",
    "ht_samples.write(\"gs://hail-datasets-us/1000_Genomes_NYGC_30x_HighCov_samples.ht\", overwrite=False)\n",
    "ht_samples = hl.read_table(\"gs://hail-datasets-us/1000_Genomes_NYGC_30x_HighCov_samples.ht\")\n",
    "ht_samples.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ruled-processor",
   "metadata": {},
   "source": [
    "### Phased genotypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "elegant-maker",
   "metadata": {},
   "source": [
    "Creating MTs for the phased data is straightforward, as multiallelic variants were split during phasing."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "increasing-component",
   "metadata": {},
   "source": [
    "#### Autosomes (phased):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "excessive-library",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "mt = hl.import_vcf(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_phased_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_GRCh38.vcf.bgz\",\n",
    "    reference_genome=\"GRCh38\"\n",
    ")\n",
    "\n",
    "n_rows, n_cols = mt.count()\n",
    "n_partitions = mt.n_partitions()\n",
    "\n",
    "mt = mt.annotate_globals(\n",
    "    metadata=hl.struct(\n",
    "        name=\"1000_Genomes_HighCov_autosomes\",\n",
    "        reference_genome=\"GRCh38\",\n",
    "        n_rows=n_rows,\n",
    "        n_cols=n_cols,\n",
    "        n_partitions=n_partitions\n",
    "    )\n",
    ")\n",
    "\n",
    "# Get list of INFO fields that are arrays\n",
    "known_keys = [x[0] for x in list(mt.row.info.items()) if \"array\" in str(x[1])]\n",
    "\n",
    "# Extract value from INFO array fields (all arrays are length 1)\n",
    "mt = mt.annotate_rows(\n",
    "    info = mt.info.annotate(\n",
    "        **{k: hl.or_missing(hl.is_defined(mt.info[k]),\n",
    "                            mt.info[k][0])\n",
    "           for k in known_keys}\n",
    "    )\n",
    ")\n",
    "\n",
    "mt = mt.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_phased_GRCh38.mt\",\n",
    "    overwrite=False,\n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "mt = mt.annotate_cols(**ht_samples[mt.s])\n",
    "mt = hl.sample_qc(mt)\n",
    "mt = hl.variant_qc(mt)\n",
    "\n",
    "mt.write(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/autosomes_phased.mt\", overwrite=False)\n",
    "mt = hl.read_matrix_table(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/autosomes_phased.mt\")\n",
    "mt.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "leading-directory",
   "metadata": {},
   "source": [
    "#### ChrX (phased):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "alien-medicaid",
   "metadata": {},
   "outputs": [],
   "source": [
    "mt = hl.import_vcf(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_phased_chrX_GRCh38.vcf.bgz\",\n",
    "    reference_genome=\"GRCh38\"\n",
    ")\n",
    "\n",
    "n_rows, n_cols = mt.count()\n",
    "n_partitions = mt.n_partitions()\n",
    "\n",
    "mt = mt.annotate_globals(\n",
    "    metadata=hl.struct(\n",
    "        name=\"1000_Genomes_HighCov_chrX\",\n",
    "        reference_genome=\"GRCh38\",\n",
    "        n_rows=n_rows,\n",
    "        n_cols=n_cols,\n",
    "        n_partitions=n_partitions\n",
    "    )\n",
    ")\n",
    "\n",
    "# Get list of INFO fields that are arrays\n",
    "known_keys = [x[0] for x in list(mt.row.info.items()) if \"array\" in str(x[1])]\n",
    "\n",
    "# Extract appropriate value from INFO array fields (all arrays are length 1)\n",
    "mt = mt.annotate_rows(\n",
    "    info = mt.info.annotate(\n",
    "        **{k: hl.or_missing(hl.is_defined(mt.info[k]),\n",
    "                            mt.info[k][0])\n",
    "           for k in known_keys}\n",
    "    )\n",
    ")\n",
    "\n",
    "mt = mt.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_phased_GRCh38.mt\",\n",
    "    overwrite=False,\n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "mt = mt.annotate_cols(**ht_samples[mt.s])\n",
    "mt = hl.sample_qc(mt)\n",
    "mt = hl.variant_qc(mt)\n",
    "\n",
    "mt.write(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrX_phased.mt\", overwrite=False)\n",
    "mt = hl.read_matrix_table(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrX_phased.mt\")\n",
    "mt.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ideal-change",
   "metadata": {},
   "source": [
    "### Unphased genotypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "statutory-karaoke",
   "metadata": {},
   "source": [
    "#### Autosomes (unphased):"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "above-wales",
   "metadata": {},
   "source": [
    "Import chr1-chr22 VCF to `MatrixTable` and checkpoint:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "painful-virtue",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "mt = hl.import_vcf(\n",
    "        (\"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_\"\n",
    "         \"chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_\"\n",
    "         \"GRCh38.vcf.bgz\"),\n",
    "        reference_genome=\"GRCh38\",\n",
    "        array_elements_required=False\n",
    ")\n",
    "mt = mt.annotate_entries(\n",
    "    PL = hl.if_else(mt.PL.contains(hl.missing(hl.tint32)), \n",
    "                    hl.missing(mt.PL.dtype), \n",
    "                    mt.PL)\n",
    ")\n",
    "mt = mt.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_imported_vcf.mt\", \n",
    "    overwrite=False, \n",
    "    _read_if_exists=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "original-admission",
   "metadata": {},
   "source": [
    "Separate biallelic and multiallelic variants, split multiallelic variants with `split_multi_hts`, and then `union_rows` the split multiallelic MT back to the biallelic MT. \n",
    "\n",
    "For multiallelic variants we will just set `PL` to be missing, to avoid running into index out of bounds errors in `split_multi_hts`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "diverse-march",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "mt = hl.read_matrix_table(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_imported_vcf.mt\"\n",
    ")\n",
    "\n",
    "bi = mt.filter_rows(hl.len(mt.alleles) == 2)\n",
    "bi = bi.annotate_rows(a_index=1, was_split=False)\n",
    "bi = bi.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_biallelic.mt\", \n",
    "    overwrite=False, \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "multi = mt.filter_rows(hl.len(mt.alleles) > 2)\n",
    "multi = multi.annotate_entries(PL = hl.missing(multi.PL.dtype))\n",
    "multi = multi.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_multiallelic.mt\", \n",
    "    overwrite=False,\n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "split = hl.split_multi_hts(multi, keep_star=True, permit_shuffle=True)\n",
    "split = split.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_multiallelic_split.mt\", \n",
    "    overwrite=False,     \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "unioned = split.union_rows(bi)\n",
    "unioned = unioned.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_unioned.mt\", \n",
    "    overwrite=False,    \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "unioned = unioned.repartition(12000, shuffle=True)\n",
    "unioned = unioned.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_unioned_repart.mt\", \n",
    "    overwrite=False,    \n",
    "    _read_if_exists=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "criminal-terry",
   "metadata": {},
   "source": [
    "After splitting multiallelic variants, we need to extract the appropriate values from the `INFO` array fields with `a_index`. \n",
    "\n",
    "Then annotate globals with metadata, annotate columns with sample relationships, perform `sample_qc` and `variant_qc`, and write final MT to `hail-datasets-us`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "conscious-society",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "unioned = hl.read_matrix_table(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_unioned_repart.mt\"\n",
    ")\n",
    "\n",
    "# Get list of INFO fields that are arrays\n",
    "known_keys = [x[0] for x in list(unioned.row.info.items()) if \"array\" in str(x[1])]\n",
    "\n",
    "# Extract appropriate values from INFO array fields after splitting\n",
    "mt = unioned.annotate_rows(\n",
    "    info = unioned.info.annotate(\n",
    "        **{k: hl.or_missing(hl.is_defined(unioned.info[k]), \n",
    "                            unioned.info[k][unioned.a_index - 1]) \n",
    "           for k in known_keys}\n",
    "    )\n",
    ")\n",
    "\n",
    "n_rows, n_cols = mt.count()\n",
    "n_partitions = mt.n_partitions()\n",
    "\n",
    "mt = mt.annotate_globals(\n",
    "    metadata=hl.struct(\n",
    "        name=\"1000_Genomes_HighCov_autosomes\",\n",
    "        reference_genome=\"GRCh38\",\n",
    "        n_rows=n_rows,\n",
    "        n_cols=n_cols,\n",
    "        n_partitions=n_partitions\n",
    "    )\n",
    ")\n",
    "\n",
    "ht_samples = hl.read_table(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/samples.ht\")\n",
    "mt = mt.annotate_cols(**ht_samples[mt.s])\n",
    "mt = hl.sample_qc(mt)\n",
    "mt = hl.variant_qc(mt)\n",
    "\n",
    "mt.write(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/autosomes_unphased.mt\", overwrite=False)\n",
    "mt = hl.read_matrix_table(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/autosomes_unphased.mt\")\n",
    "mt.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "blank-dance",
   "metadata": {},
   "source": [
    "#### ChrX (unphased):"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "distributed-numbers",
   "metadata": {},
   "source": [
    "Import chrX VCF to `MatrixTable` and checkpoint:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "organized-bunny",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "mt = hl.import_vcf(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_chrX_GRCh38.vcf.bgz\",\n",
    "    reference_genome=\"GRCh38\", \n",
    "    array_elements_required=False\n",
    ")\n",
    "mt = mt.annotate_entries(\n",
    "    PL = hl.if_else(mt.PL.contains(hl.missing(hl.tint32)), \n",
    "                    hl.missing(mt.PL.dtype), \n",
    "                    mt.PL)\n",
    ")\n",
    "mt = mt.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_imported_vcf.mt\", \n",
    "    overwrite=False, \n",
    "    _read_if_exists=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "official-doubt",
   "metadata": {},
   "source": [
    "Separate biallelic and multiallelic variants, split multiallelic variants with `split_multi_hts`, and then `union_rows` the split multiallelic MT back to the biallelic MT. \n",
    "\n",
    "For multiallelic variants we will just set `PL` to be missing, to avoid running into index out of bounds errors in `split_multi_hts`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "convertible-distribution",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "mt = hl.read_matrix_table(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_imported_vcf.mt\"\n",
    ")\n",
    "\n",
    "bi = mt.filter_rows(hl.len(mt.alleles) == 2)\n",
    "bi = bi.annotate_rows(a_index=1, was_split=False)\n",
    "bi = bi.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_biallelic.mt\", \n",
    "    overwrite=False, \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "multi = mt.filter_rows(hl.len(mt.alleles) > 2)\n",
    "multi = multi.annotate_entries(PL = hl.missing(multi.PL.dtype))\n",
    "multi = multi.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_multiallelic.mt\", \n",
    "    overwrite=False,\n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "split = hl.split_multi_hts(multi, keep_star=True, permit_shuffle=True)\n",
    "split = split.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_multiallelic_split.mt\", \n",
    "    overwrite=False,     \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "unioned = split.union_rows(bi)\n",
    "unioned = unioned.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_unioned.mt\", \n",
    "    overwrite=False,    \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "unioned = unioned.repartition(512, shuffle=True)\n",
    "unioned = unioned.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_unioned_repart.mt\", \n",
    "    overwrite=False,    \n",
    "    _read_if_exists=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "lonely-storm",
   "metadata": {},
   "source": [
    "After splitting multiallelic variants, we need to extract the appropriate values from the `INFO` array fields with `a_index`. \n",
    "\n",
    "Then annotate globals with metadata, annotate columns with sample relationships, perform `sample_qc` and `variant_qc`, and write final MT to `hail-datasets-us`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "beginning-outline",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "unioned = hl.read_matrix_table(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_unioned_repart.mt\"\n",
    ")\n",
    "\n",
    "# Get list of INFO fields that are arrays\n",
    "known_keys = [x[0] for x in list(unioned.row.info.items()) if \"array\" in str(x[1])]\n",
    "\n",
    "# Extract appropriate values from INFO array fields after splitting\n",
    "mt = unioned.annotate_rows(\n",
    "    info = unioned.info.annotate(\n",
    "        **{k: hl.or_missing(hl.is_defined(unioned.info[k]), \n",
    "                            unioned.info[k][unioned.a_index - 1]) \n",
    "           for k in known_keys}\n",
    "    )\n",
    ")\n",
    "\n",
    "n_rows, n_cols = mt.count()\n",
    "n_partitions = mt.n_partitions()\n",
    "\n",
    "mt = mt.annotate_globals(\n",
    "    metadata=hl.struct(\n",
    "        name=\"1000_Genomes_HighCov_chrX\",\n",
    "        reference_genome=\"GRCh38\",\n",
    "        n_rows=n_rows,\n",
    "        n_cols=n_cols,\n",
    "        n_partitions=n_partitions\n",
    "    )\n",
    ")\n",
    "\n",
    "ht_samples = hl.read_table(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/samples.ht\")\n",
    "mt = mt.annotate_cols(**ht_samples[mt.s])\n",
    "mt = hl.sample_qc(mt)\n",
    "mt = hl.variant_qc(mt)\n",
    "\n",
    "mt.write(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrX_unphased.mt\", overwrite=False)\n",
    "mt = hl.read_matrix_table(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrX_unphased.mt\")\n",
    "mt.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "existing-helping",
   "metadata": {},
   "source": [
    "#### ChrY (unphased):"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "suspected-savannah",
   "metadata": {},
   "source": [
    "Import chrY VCF to `MatrixTable` and checkpoint:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "distinguished-smooth",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "mt = hl.import_vcf(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_chrY_GRCh38.vcf.bgz\",\n",
    "    reference_genome=\"GRCh38\", \n",
    "    array_elements_required=False\n",
    ")\n",
    "mt = mt.annotate_entries(\n",
    "    PL = hl.if_else(mt.PL.contains(hl.missing(hl.tint32)), \n",
    "                    hl.missing(mt.PL.dtype), \n",
    "                    mt.PL)\n",
    ")\n",
    "mt = mt.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_imported_vcf.mt\", \n",
    "    overwrite=False, \n",
    "    _read_if_exists=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "suspected-locator",
   "metadata": {},
   "source": [
    "Separate biallelic and multiallelic variants, split multiallelic variants with `split_multi_hts`, and then `union_rows` the split multiallelic MT back to the biallelic MT. \n",
    "\n",
    "For multiallelic variants we will just set `PL` to be missing, to avoid running into index out of bounds errors in `split_multi_hts`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "answering-nitrogen",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "mt = hl.read_matrix_table(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_imported_vcf.mt\"\n",
    ")\n",
    "\n",
    "bi = mt.filter_rows(hl.len(mt.alleles) == 2)\n",
    "bi = bi.annotate_rows(a_index=1, was_split=False)\n",
    "bi = bi.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_biallelic.mt\", \n",
    "    overwrite=False, \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "multi = mt.filter_rows(hl.len(mt.alleles) > 2)\n",
    "multi = multi.annotate_entries(PL = hl.missing(multi.PL.dtype))\n",
    "multi = multi.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_multiallelic.mt\", \n",
    "    overwrite=False,\n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "split = hl.split_multi_hts(multi, keep_star=True, permit_shuffle=True)\n",
    "split = split.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_multiallelic_split.mt\", \n",
    "    overwrite=False,     \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "unioned = split.union_rows(bi)\n",
    "unioned = unioned.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_unioned.mt\", \n",
    "    overwrite=False,    \n",
    "    _read_if_exists=True\n",
    ")\n",
    "\n",
    "unioned = unioned.repartition(8, shuffle=True)\n",
    "unioned = unioned.checkpoint(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_unioned_repart.mt\", \n",
    "    overwrite=False,    \n",
    "    _read_if_exists=True\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "professional-cleaning",
   "metadata": {},
   "source": [
    "After splitting multiallelic variants, we need to extract the appropriate values from the `INFO` array fields with `a_index`. \n",
    "\n",
    "Then annotate globals with metadata, annotate columns with sample relationships, perform `sample_qc` and `variant_qc`, and write final MT to `hail-datasets-us`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "alternate-motor",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "unioned = hl.read_matrix_table(\n",
    "    \"gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_unioned_repart.mt\"\n",
    ")\n",
    "\n",
    "# Get list of INFO fields that are arrays\n",
    "known_keys = [x[0] for x in list(unioned.row.info.items()) if \"array\" in str(x[1])]\n",
    "\n",
    "# Extract appropriate values from INFO array fields after splitting\n",
    "mt = unioned.annotate_rows(\n",
    "    info = unioned.info.annotate(\n",
    "        **{k: hl.or_missing(hl.is_defined(unioned.info[k]), \n",
    "                            unioned.info[k][unioned.a_index - 1]) \n",
    "           for k in known_keys}\n",
    "    )\n",
    ")\n",
    "\n",
    "n_rows, n_cols = mt.count()\n",
    "n_partitions = mt.n_partitions()\n",
    "\n",
    "mt = mt.annotate_globals(\n",
    "    metadata=hl.struct(\n",
    "        name=\"1000_Genomes_HighCov_chrY\",\n",
    "        reference_genome=\"GRCh38\",\n",
    "        n_rows=n_rows,\n",
    "        n_cols=n_cols,\n",
    "        n_partitions=n_partitions\n",
    "    )\n",
    ")\n",
    "\n",
    "ht_samples = hl.read_table(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/samples.ht\")\n",
    "mt = mt.annotate_cols(**ht_samples[mt.s])\n",
    "mt = hl.sample_qc(mt)\n",
    "mt = hl.variant_qc(mt)\n",
    "\n",
    "mt.write(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrY_unphased.mt\", overwrite=False)\n",
    "mt = hl.read_matrix_table(\"gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrY_unphased.mt\")\n",
    "mt.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Create/update schemas"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "exposed-ivory",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import textwrap\n",
    "\n",
    "output_dir = os.path.abspath(\"../../hail/python/hail/docs/datasets/schemas\")\n",
    "datasets_path = os.path.abspath(\"../../hail/python/hail/experimental/datasets.json\")\n",
    "with open(datasets_path, \"r\") as f:\n",
    "    datasets = json.load(f)\n",
    "\n",
    "names = datasets.keys()\n",
    "for name in [name for name in names if \"1000_Genomes_HighCov\" in name]:\n",
    "    versions = sorted(set(dataset[\"version\"] for dataset in datasets[name][\"versions\"]))\n",
    "    if not versions:\n",
    "        versions = [None]\n",
    "    reference_genomes = sorted(set(dataset[\"reference_genome\"] for dataset in datasets[name][\"versions\"]))\n",
    "    if not reference_genomes:\n",
    "        reference_genomes = [None]\n",
    "\n",
    "    print(name)\n",
    "    # Create schemas for unphased versions, since phased entries only have GT\n",
    "    if name == \"1000_Genomes_HighCov_chrY\":\n",
    "        v = versions[0]\n",
    "    else:\n",
    "        v = versions[1]\n",
    "    print(v)\n",
    "    print(reference_genomes[0] + \"\\n\")\n",
    "\n",
    "    path = [dataset[\"url\"][\"gcp\"][\"us\"]\n",
    "            for dataset in datasets[name][\"versions\"]\n",
    "            if all([dataset[\"version\"] == v,\n",
    "                    dataset[\"reference_genome\"] == reference_genomes[0]])]\n",
    "    assert len(path) == 1\n",
    "    path = path[0]\n",
    "    if path.endswith(\".ht\"):\n",
    "        table = hl.methods.read_table(path)\n",
    "        table_class = \"hail.Table\"\n",
    "    else:\n",
    "        table = hl.methods.read_matrix_table(path)\n",
    "        table_class = \"hail.MatrixTable\"\n",
    "\n",
    "    description = table.describe(handler=lambda x: str(x)).split(\"\\n\")\n",
    "    description = \"\\n\".join([line.rstrip() for line in description])\n",
    "\n",
    "    template = \"\"\".. _{dataset}:\n",
    "\n",
    "{dataset}\n",
    "{underline1}\n",
    "\n",
    "*  **Versions:** {versions}\n",
    "*  **Reference genome builds:** {ref_genomes}\n",
    "*  **Type:** :class:`{class}`\n",
    "\n",
    "Schema ({version0}, {ref_genome0})\n",
    "{underline2}\n",
    "\n",
    ".. code-block:: text\n",
    "\n",
    "{schema}\n",
    "\n",
    "\"\"\"\n",
    "    context = {\n",
    "        \"dataset\": name,\n",
    "        \"underline1\": len(name) * \"=\",\n",
    "        \"version0\": v,\n",
    "        \"ref_genome0\": reference_genomes[0],\n",
    "        \"versions\": \", \".join([str(version) for version in versions]),\n",
    "        \"ref_genomes\": \", \".join([str(reference_genome) for reference_genome in reference_genomes]),\n",
    "        \"underline2\": len(\"\".join([\"Schema (\", str(v), \", \", str(reference_genomes[0]), \")\"])) * \"~\",\n",
    "        \"schema\": textwrap.indent(description, \"    \"),\n",
    "        \"class\": table_class\n",
    "    }\n",
    "    with open(output_dir + f\"/{name}.rst\", \"w\") as f:\n",
    "        f.write(template.format(**context).strip())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}